*********************************
/*	Code for Studying Patent Infringement Decisions */
/* 	Date created: 2022-06-30 */
/*	Author: Jason Rantanen */
*********************************

// This file contains the code associated with the statistics for Studying Patent Infringement Litigation. It consists of two primary sections: (1) one part that prepares the three starting datasets to be linked together and then merges them together, and (2) one part that analyzes and reports on observations from the combined dataset. 

/* Starting files:
	-cases.dta - this is the USPTO Patent Litigation Dataset, available at https://www.uspto.gov/ip-policy/economic-research/research-datasets/patent-litigation-docket-reports-data
	-2022-04-10 CAFC Dockets.dta - this is the Federal Circuit Dataset Project dataset of Federal Circuit Appeal dockets as of 2022-04-10
	-appeals 2022-06-24.dta - this is the Federal Circuit Dataset Project dataset of Federal Circuit documents as of 2022-06-24

	Supplemental files:
		-"2022-06-23 Civil Action hand formatting.xlsx": This is a set of hand-formatted appeal docket number corrections for numbers that could not be automatically formatted
*/



cd "~/CAFC Data" // Set to the working directory

**************************
/* Preparation of the USPTO District Court litigation dataset */
**************************

// Purpose: load the USPTO litigation file, collapse it to one record per
// (case_number, district_id) pair, and save the result for the later merge.

use "cases.dta", clear // This is the USPTO data file

// Show how many (case_number, district_id) pairs occur more than once.
duplicates report case_number district_id

// A stable sort keeps tied records in their original file order. Without it,
// Stata orders ties arbitrarily, so the record retained by -duplicates drop-
// below (always the first of each group) could differ from run to run.
sort district_id case_number, stable
duplicates tag case_number district_id, gen(dup) // Conduct visual review of these records (dup > 0)

// The USPTO dataset contains a very small number of duplicate records that the
// authors kept; however, the additional information is not relevant to this
// project. Only the first record of each duplicated pair is retained.
duplicates drop case_number district_id, force

drop dup // The review flag is no longer needed once the duplicates are gone.

save "USPTO Patent Litigation Dataset (prepared).dta", replace

**************************
/* Preparation of the Appeal Dockets dataset */
**************************
// This portion of the code standardizes the district court docket numbers in the appeal docket dataset. It is provided as an example of how docket numbers recorded in different formats can be standardized.

use "2022-04-10 CAFC Dockets.dta", clear

// Keep only appeals whose coded origin is a district court. Note that this drops
// all appeals that we have not coded the origin for. We have origin data on all
// appeals for CY 2011 and later.
keep if origin=="DCT"

// A small number of records have the originating number followed by a lead docket
// number. We are working with the first group of docket numbers.
split OriginatingCase, parse("Lead:") gen(docket_numbers_)

// Break the first group apart on ":" (this separates out the division prefix).
// The rest of this section relies on the pieces temp1, temp2 and temp3.
split docket_numbers_1, parse(:) gen(temp)

// The approach is to run through each common format, parse the entry, and rebuild
// it in the standardized form. The older formats do not use zeros as placeholders
// to make a five-digit suffix, so the numeric suffix is re-padded here.
// `zeros' supplies the leading zeros and `digits' the number itself; both are
// expressions evaluated against whichever suffix_num exists at the time.
local zeros  cond(suffix_num<10, "0000", cond(suffix_num<100, "000", cond(suffix_num<1000, "00", cond(suffix_num<10000, "0", ""))))
local digits string(suffix_num, "%12.0g")

// ---- set 1: old format civil action numbers with "CV" (found in temp2) ----
split temp1, gen(prefix) parse(-)        // prefix2 is the piece after the first hyphen of temp1
split temp2, parse(-CV-) gen(suffix)     // suffix1 = text before "-CV-", suffix2 = text after it
destring suffix2, generate(suffix_num) force   // non-numeric suffixes become missing

gen docket_set1 = prefix2 + ":" + suffix1 + "-cv-" + `zeros' + `digits' ///
	if suffix_num<100000 & strpos(temp2,"CV")>0

drop prefix1 prefix2 prefix3 suffix1 suffix2 suffix_num

// ---- set 2: new format civil action numbers with "cv" (found in temp3) ----
split temp3, parse(-) gen(suffix)        // the third hyphen-delimited piece is the number
destring suffix3, generate(suffix_num) force

gen docket_set2 = temp2 + ":" + suffix1 + "-cv-" + `zeros' + `digits' ///
	if suffix_num<100000 & strpos(temp3,"cv")>0

drop prefix4 prefix5 suffix1 suffix2 suffix3 suffix4 suffix5 suffix6 suffix_num

// ---- set 3: old format civil action numbers with "MD" (found in temp2) ----
split temp1, gen(prefix) parse(-)
split temp2, parse(-MD-) gen(suffix)
destring suffix2, generate(suffix_num) force

gen docket_set3 = prefix2 + ":" + suffix1 + "-md-" + `zeros' + `digits' ///
	if suffix_num<100000 & strpos(temp2,"MD")>0

drop prefix1 prefix2 prefix3 suffix1 suffix2 suffix_num   // prefix4 and prefix5 are left in place and dropped later in the file

// ---- set 4: new format civil action numbers with "md" (found in temp3) ----
// These are reassembled from their first three hyphen-delimited pieces without re-padding.
split temp3, parse(-) gen(suffix)

gen docket_set4 = temp2 + ":" + suffix1 + "-" + suffix2 + "-" + suffix3 ///
	if strpos(temp3,"md")>0

drop suffix1 suffix2 suffix3 suffix4 suffix5 suffix6

// ---- set 5: new format civil action numbers with "CV" (found in temp3) ----
split temp3, parse(-) gen(suffix)
drop suffix4 suffix5 suffix6
split suffix3, parse(AG (JPRx)) // There is one funky entry that needs to be cleaned up before it can be worked with.
destring suffix31, generate(suffix_num) force

gen docket_set5 = temp2 + ":" + suffix1 + "-cv-" + `zeros' + `digits' ///
	if suffix_num<100000 & strpos(temp3,"CV")>0

drop suffix1 suffix2 suffix3 suffix31 suffix32 suffix_num

// ---- combine: take the first format that produced a number ----
gen CivilAction = docket_set1
forvalues k = 2/5 {
	replace CivilAction = docket_set`k' if CivilAction==""
}

drop temp1 temp2 temp3 docket_numbers_1 docket_numbers_2 ///
	docket_set1 docket_set2 docket_set3 docket_set4 docket_set5
replace CivilAction=strtrim(CivilAction)

// Appeals that none of the five patterns could format, by year of appeal.
tab Year_Appeal_Filed if CivilAction==""

// This leaves 90 observations that need to be hand formatted. All the rest have been formatted into the 0:00-cv-00019 format.

// Purpose: overlay the hand-formatted corrections on the automatically formatted
// docket numbers, merge the result back into the full appeal docket file, and save it.
// Scratch copies are kept in Stata temporary files, which Stata removes on its own,
// rather than in fixed-name files in the working directory. All of these lines must
// be run together because the tempfile names are local macros.
tempfile formatted_dockets hand_formatting

save `formatted_dockets'

import excel "2022-06-23 Civil Action hand formatting.xlsx", sheet("Sheet1") firstrow clear // These are the 90 records that were reviewed for hand formatting.

save `hand_formatting'

use `formatted_dockets', clear

// -update replace- lets values from the hand-formatted file overwrite the values in
// memory for every variable the two files share.
// NOTE(review): presumably the spreadsheet carries PACER_ID and CivilAction - confirm its columns.
merge 1:1 PACER_ID using `hand_formatting', update replace

drop prefix4 prefix5 _merge // prefix4 and prefix5 are leftovers from the docket-number parsing above

duplicates report CivilAction district_id // This reports the district court cases that show up multiple times in the appeal sets.

duplicates report CivilAction district_id DateFiledinCOA // This reports the district court cases that show up multiple times in the appeal set and which have appeals filed on identical days.

// It's worth noting that there are a lot of duplicate origins here - as shown by duplicates report CivilAction district_id. Looking at some of the actual appeals, they really do seem to be duplicates: everything down to the notice of appeal is virtually identical but for some different CAFC docket numbers. Careful review indicates very slight differences in the notices of appeal.

// The next step is to merge the formatted Civil Action data back into the main appeal
// dockets dataset. Because only district court appeals were kept before formatting,
// this brings the remaining appeals back in.
merge 1:1 PACER_ID using "2022-04-10 CAFC Dockets.dta"

drop _merge

rename Appeal_Dockets Appeal_Docket

rename CivilAction case_number // "case_number" is the term that's used in the USPTO dataset.

replace case_number = OriginatingCase if case_number=="" // this copies over the OriginatingCase data if there is otherwise no case number. In practice, this consists of appeals from origins other than district courts.

save "CAFC Docket Dataset with formatted cv numbers.dta", replace


**************************
/* Merge Appeal Docket dataset into USPTO Dataset */
**************************

// Purpose: attach appeal dockets to district court cases, number the appeals within
// each case, report match statistics, and save the appeal-level combined dataset.

use "USPTO Patent Litigation Dataset (prepared).dta", clear

// NOTE: this is a one-to-many merge. That means that records in the district court
// docket dataset that match to an appeal may match to MORE than one appeal. The result
// is a dataset that is still unique with respect to appeal dockets but is not unique
// with respect to Civil Action (case_number) numbers.
merge 1:m case_number district_id using "CAFC Docket Dataset with formatted cv numbers.dta"

// Number the records within each civil action: 0 for the earliest-filed appeal (and
// for cases with a single record), then 1, 2, ... Appeals from the same case can be
// filed on the same day, so PACER_ID breaks those ties; otherwise the record flagged
// 0 would be arbitrary and the filters below that combine dup==0 with per-appeal
// fields (e.g. Misc_Docket) could give different counts from run to run.
bysort district_id case_number (DateFiledinCOA PACER_ID): gen dup = _n - 1

**************************
/* Generate descriptive statistics of DCT to Appeal */
**************************

// Relationship of appeals to USPTO cases

tab dup if _merge==3 // This reports the number of civil actions that matched, as well as the number of multiple appeals for a given civil action.

tab Year_Appeal_Filed _merge if DistrictCourt==1 & Misc_Docket!=1 & Year_Appeal_Filed>2010 // Total number of matches

// infringement_case: 1 for case types 1-4, 0 for every other district court record.
gen infringement_case = 1 if inlist(case_type_1, 1, 2, 3, 4)
replace infringement_case = 0 if case_row_id!=. & infringement_case==. // The records from the district court dataset that are not coded as case types 1-4
replace infringement_case = 0 if infringement_case==. & DistrictCourt==1 // The records from the appeal dataset that did not match to the USPTO dataset

tab Year_Appeal_Filed infringement_case if DistrictCourt==1 & Misc_Docket!=1 & Year_Appeal_Filed>2010


// Relationship of USPTO cases to appeals

gen case_filed_year = year(date_filed)
tab case_filed_year _merge if dup==0 & Misc_Docket!=1 & case_filed_year>2010 // This provides the number of cases in the USPTO dataset filed each year that have at least one appeal.

// Stata treats a missing value as larger than any number, so "case_filed_year>2010"
// alone would also count cases whose filing date is missing. The explicit
// !missing() test keeps those out of the per-district counts.
tab district_id if case_filed_year>2010 & !missing(case_filed_year) & dup==0 & infringement_case==1 // This table is used in Part III.A. to identify the number of cases filed for each district.

// Keep every appeal docket (matched or not); drop district court cases with no appeal.
keep if (_merge==2 | _merge==3)

drop _merge

save "combined district court and appellate dataset.dta", replace

**************************
/* Prepare appeal document dataset */
**************************

// Purpose: reduce the document dataset to one row per (decision, appeal docket),
// then merge it onto the combined district court + appeal docket dataset.

use "appeals 2022-06-24.dta", clear

keep if docType=="Rule 36" | docType=="Opinion"
keep if Replaced==""
keep if docDate < date("1/1/2022","MDY") // decisions dated before 2022

// NOTE(review): these three documents are excluded by ID; the reason is not recorded here.
drop if inlist(uniqueID, 30275, 28919, 27341)

keep uniqueID Appeal_Dockets docType docDate PrecedentialStatus origin DispGeneral CloudLink

// A document can list several appeal dockets separated by ";". Split them into
// columns and reshape so that each docket gets its own row.
split Appeal_Dockets, gen(temp) parse(;)
reshape long temp, i(uniqueID) j(num)

// Trim BEFORE discarding empty pieces: a piece made up only of spaces would
// otherwise pass the emptiness test and then be trimmed into an empty docket number.
replace temp=strtrim(temp)
drop if temp==""
rename temp Appeal_Docket
drop num

duplicates report Appeal_Docket
duplicates tag Appeal_Docket, gen(dup) // There are six duplicate appeal dockets in this set.
drop if dup >0 // every copy of a duplicated docket is removed, so Appeal_Docket is unique for the 1:1 merge below
drop dup

// Hold the prepared decisions in a Stata temporary file (removed automatically)
// instead of a fixed-name file in the working directory. These lines and the merge
// below must be run together because the tempfile name is a local macro.
tempfile decisions
save `decisions'

**************************
/* Merge appeal document dataset into district court + appeal docket dataset */
**************************

use "combined district court and appellate dataset.dta", clear

merge 1:1 Appeal_Docket using `decisions'

// As of the end of 2021, 13 appeal numbers from the document dataset do not match records in the docket dataset. This is expected - there are a small number of dockets that are under seal, but nonetheless had publicly released orders. 

// While the document dataset does have civil action numbers for the district court dockets, they add relatively little at this point because many are missing the division. Consequently, at present the best match is just using the district court docket numbers from the appeals dataset. The consequence is that we only have district court dockets for appeals filed in 2011 and later. 

**************************
/* Generate descriptive statistics of Appeals to Decision */
**************************

// Purpose: report how appeal dockets matched to decisions, count decisions by year
// and origin, and summarize the time from appeal filing to decision.

tab Year_Appeal_Filed _merge

tab Year_Appeal_Filed _merge if Year_Appeal_Filed>2007

// Number the appeals within each decision so the data can be examined on a
// per-decision basis: the earliest appeal filed for a decision is coded 0, then
// 1, 2, ... Appeals resolved together can share a filing date, so Appeal_Docket
// breaks those ties; otherwise the appeal coded 0 would be arbitrary and tables
// that combine dup_appeals==0 with per-appeal fields could change between runs.
bysort uniqueID (DateFiledinCOA Appeal_Docket): gen dup_appeals = _n - 1

tab Year_Appeal_Filed _merge if dup_appeals==0 & uniqueID!=. & Year_Appeal_Filed>2007 & Misc_Docket!=1 // This gives the number of unique opinions.

// Stata treats missing as larger than any number, and the -missing- option shows
// missing categories, so rows with no appeal year are excluded explicitly.
tab Year_Appeal_Filed origin_PACER if Year_Appeal_Filed>2007 & !missing(Year_Appeal_Filed) & Misc_Docket!=1, missing

gen docYear = year(docDate)

tab docYear origin_PACER if dup_appeals==0 & uniqueID!=. & docYear>2007 & Misc_Docket!=1 , missing // This gives the number of decisions by year of decision.

// NOTE(review): "origin" is read from both the docket file and the document file.
// A plain -merge- keeps the value already in memory for matched rows, so confirm
// that this variable really carries the document-level coding described here.
tab docYear origin if dup_appeals==0 & uniqueID!=. & docYear>2007 & Misc_Docket!=1, missing // this is the better set to use because it contains origin coding for pre-2011 appeals that resulted in decisions.

// Days from the filing of the appeal to the decision.
gen appeal_to_decision_time = docDate - DateFiledinCOA

// Common filter: rows that have a decision, decided after 2007, not miscellaneous dockets.
local decided uniqueID!=. & docYear>2007 & Misc_Docket!=1

display as text _n "Time to decision: every appeal row"
sum appeal_to_decision_time if `decided'

display as text _n "Time to decision: one row per decision"
sum appeal_to_decision_time if `decided' & dup_appeals==0

foreach dtype in "Opinion" "Rule 36" {
	display as text _n "Time to decision: docType == `dtype'"
	sum appeal_to_decision_time if `decided' & docType=="`dtype'" & dup_appeals==0
}

// The same three summaries (all decisions, opinions, Rule 36) for each origin.
foreach src in DCT CAVC PTO CFC MSPB {
	display as text _n "Time to decision: origin == `src'"
	sum appeal_to_decision_time if `decided' & dup_appeals==0 & origin=="`src'"
	foreach dtype in "Opinion" "Rule 36" {
		display as text _n "Time to decision: origin == `src', docType == `dtype'"
		sum appeal_to_decision_time if `decided' & docType=="`dtype'" & dup_appeals==0 & origin=="`src'"
	}
}


// Dispositions

tab DispGeneral if `decided' & dup_appeals==0
tab  origin DispGeneral if `decided' & dup_appeals==0

**************************
/* Generate descriptive statistics of Districts to Appeals/Decisions to Decision */
**************************


// Origins of appeals

// Stata treats a missing value as larger than any number, so an open-ended
// "case_filed_year > 2010" would also admit cases with no filing year. The
// explicit !missing() test keeps those out.
tab district_id if dup==0 & case_filed_year > 2010 & !missing(case_filed_year) & infringement_case==1 // This provides the frequency of patent infringement district court cases with an appeal by origin



// Cases with multiple appeals and decisions

duplicates report case_number district_id if case_filed_year > 2010 & case_filed_year < 2016 & infringement_case==1 & Appeal_Docket!=""

duplicates report uniqueID case_number district_id if case_filed_year > 2010 & infringement_case==1 & case_filed_year < 2016 & uniqueID!=. // This reports the number of appeals that involve the same district court case number but which were decided in the same opinion. For example, a patent infringement case might generate two appeals, but both appeals would be resolved in the same appellate decision. In that case, it would be counted as a duplicate.

// To identify these, it's necessary to count each of the instances in which it's the same civil action number and same appellate decision.
// dup_case_decision is 0 for the first row of each (decision, civil action) pair and 1, 2, ... for the rest.

bysort uniqueID case_number district_id : gen dup_case_decision = _n - 1

duplicates report case_number district_id if case_filed_year > 2010 & case_filed_year < 2016 & infringement_case==1 & dup_case_decision==0 & uniqueID != . // This gives us the number of instances of the same case number being involved in multiple opinions. It counts each decision:district court case number pair once.

// Flag civil actions that appear on more than one row (dup_case > 0), for inspection with a filter such as the one below.
duplicates tag case_number district_id, gen(dup_case)


//case_filed_year > 2010 & case_filed_year < 2016 & infringement_case==1 & uniqueID != . & dup_case>0


// Outcomes by origin

tab district_id DispGeneral if uniqueID!=. & case_filed_year > 2010 & !missing(case_filed_year) & infringement_case==1 & Misc_Docket!=1 & dup_appeals==0 // This provides the disposition of patent infringement appeals by district (district_id)



